{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "49ee922e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, AutoModelForCausalLM\n",
    "from transformers import LlamaConfig\n",
    "from huggingface_hub import snapshot_download\n",
    "from safetensors.torch import load_file, save_file, safe_open\n",
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c79d19b5",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.\n",
      "The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation=\"flash_attention_2\"` instead.\n",
      "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fed6f0333a85452bb15c90a615fd48c7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "model = LlamaForCausalLM.from_pretrained(\n",
    "    'mesolitica/malaysian-mistral-7b-32k-instructions-v5',\n",
    "    config = LlamaConfig.from_pretrained('mesolitica/malaysian-mistral-7b-32k-instructions-v5'),\n",
    "    use_flash_attention_2 = True,\n",
    "    torch_dtype = torch.float16\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed3c6ad3",
   "metadata": {},
   "outputs": [],
   "source": [
    "_ = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c2403612",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<s> [INST] awak teloq [/INST]Maaf, saya tidak memahami perkataan \"awak teloq\" dalam bahasa Melayu. Bolehkah anda mengulangi atau memberikan maklumat lanjut?</s>\n"
     ]
    }
   ],
   "source": [
    "s = \"\"\"\n",
    "awak teloq\n",
    "\"\"\"\n",
    "\n",
    "messages = [\n",
    "    {'role': 'user', 'content': s.strip()},\n",
    "]\n",
    "prompt = tokenizer.apply_chat_template(messages, tokenize = False)\n",
    "inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    inputs,\n",
    "    max_new_tokens=128,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.9,\n",
    "    do_sample=True,\n",
    "    num_beams=1,\n",
    ")\n",
    "r = model.generate(**generate_kwargs)\n",
    "print(tokenizer.decode(r[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69ea1413",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
